Data is from https://gender-pay-gap.service.gov.uk/viewing/download

Load in packages and data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(infer)
# here is how I would read the data files one at a time
gender_pay_2017_18 <- read_csv("data/UK Gender Pay Gap Data - 2017 to 2018.csv")
## Rows: 10219 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): EmployerName, Address, PostCode, CompanyNumber, SicCodes, CompanyL...
## dbl (15): EmployerId, DiffMeanHourlyPercent, DiffMedianHourlyPercent, DiffMe...
## lgl  (1): SubmittedAfterTheDeadline
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gender_pay_2018_19 <- read_csv("data/UK Gender Pay Gap Data - 2018 to 2019.csv")
## Rows: 10459 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): EmployerName, Address, PostCode, CompanyNumber, SicCodes, CompanyL...
## dbl (15): EmployerId, DiffMeanHourlyPercent, DiffMedianHourlyPercent, DiffMe...
## lgl  (1): SubmittedAfterTheDeadline
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read in multiple files in one:
list_of_files <- list.files(path = "data", # specify the folder path
                            pattern = "\\.csv$", # only return files that end in csv
                            full.names = TRUE # foldeer path attached to the beginning of the file name
                            )

pay_combined <- read_csv(list_of_files, id = "file_name")
## Rows: 59315 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): EmployerName, Address, PostCode, CompanyNumber, SicCodes, CompanyL...
## dbl (15): EmployerId, DiffMeanHourlyPercent, DiffMedianHourlyPercent, DiffMe...
## lgl  (1): SubmittedAfterTheDeadline
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pay_combined

Clean the data

pay_trimmed <- pay_combined %>% 
  clean_names() %>% 
  mutate(year_starting = as.numeric(str_extract(file_name, "[0-9]+")), .before = 1) %>%  # take the first year listed in the file name
  select(contains(c("year", "post", "percent", "quartile", "size"))) # selecting desired columns

pay_trimmed

Find out the Scottish postcodes

Can be found many places but I noted them down from here: https://www.townscountiespostcodes.co.uk/postcodes-in-scotland/

scottish_postcodes <- c("AB", "DD", "PH", "FK", "G", "PA", "DG", "KA", "ML", "DG", "EH", "KY", "IV", "TD", "ZE", "HS", "KW")

Cut down the postcode column to just the first section and use this to determine if a company is in scotland or not:

pay_region <- pay_trimmed %>% 
  mutate(post_code = str_extract(post_code, "[A-Z0-9]+")) %>% 
  mutate(post_code = str_remove_all(post_code, "[0-9]+")) %>% 
  mutate(region = if_else(post_code %in% scottish_postcodes, "Scotland", "Rest of UK"), .after = post_code) %>% 
  select(-post_code)

pay_region

What pay differences can we see in Scotland year on year?

diff_mean_hourly_percent: mean % difference between male and female hourly pay (negative = women’s mean hourly pay is higher)

pay_region %>% 
  filter(region == "Scotland") %>% 
  ggplot() +
  geom_boxplot(aes(x = year_starting, y = diff_mean_hourly_percent, group = year_starting), colour = "blue", alpha = 0.5) +
  labs(
    title = "Gender pay differences across Scottish companies",
    subtitle = "2017 - 2023\n",
    x = "\nFinancial year starting",
    y = "Mean % difference in hourly pay\n(M - F)"
  ) +
  scale_x_continuous(breaks = c(2017, 2018, 2019, 2020, 2021, 2022, 2023)) +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.line = element_line(colour = "black")) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip()

Let’s check out 2023

pay_region %>% 
  filter(region == "Scotland") %>% 
  group_by(year_starting) %>% 
  summarise(nrows = n())

Are there any differences in bonuses given in Scottish companies?

male_bonus_percent: % of male employees paid a bonus

female_bonus_percent: % of female employees paid a bonus

pay_region %>% 
  filter(region == "Scotland") %>% 
  select(male_bonus_percent, female_bonus_percent) %>% 
  filter(male_bonus_percent != 0 & female_bonus_percent != 0) %>% 
  pivot_longer(cols = c("male_bonus_percent",
                        "female_bonus_percent"),
               names_to = "gender",
               values_to = "bonus_percent") %>% 
  ggplot() +
  geom_boxplot(aes(x = gender, y = bonus_percent, colour = gender), linewidth = 1.5, show.legend = FALSE) +
  labs(
    title = "Bonuses paid across Scottish companies",
    subtitle = "2017 - 2023\n",
    x = "",
    y = "% of employees paid a bonus"
  ) +
  scale_x_discrete(labels = c("Women", "Men")) +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.line = element_line(colour = "black"))

Can see that there is very little difference in the bonuses paid to men and women across Scottish companies.

Does company size affect pay disparity?

pay_region %>% 
  filter(region == "Scotland") %>% 
  mutate(employer_size = factor(employer_size, levels = c("Less than 250", "250 to 499", "500 to 999", "1000 to 4999", "5000 to 19,999", "20,000 or more", "Not Provided"))) %>% 
  filter(!employer_size %in% c("Not Provided", NA)) %>% 
  ggplot() +
  geom_boxplot(aes(x = employer_size, y = diff_mean_hourly_percent), colour = "blue") +
  labs(
    title = "Gender pay differences across Scottish companies",
    subtitle = "2017 - 2023\n",
    x = "\nCompany size",
    y = "Mean % difference in hourly pay\n(M - F)"
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.line = element_line(colour = "black")) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip()

This plot appears to show that the larger a company is, the larger the pay gap between men and women (with men earning more per hour).

How does Scotland compare to the rest of the UK year on year?

pay_region %>% 
  group_by(region, year_starting) %>% 
  mutate(region = factor(as.factor(region), levels = c("Scotland", "Rest of UK"))) %>% 
  summarise(mean_diff_hourly_across_companies = mean(diff_mean_hourly_percent)) %>% 
  ggplot() +
  geom_line(aes(x = year_starting, y = mean_diff_hourly_across_companies, colour = region),
            linewidth = 2) + 
  scale_x_continuous(breaks = c(2017, 2018, 2019, 2020, 2021, 2022, 2023)) +
  labs(
    title = "Average gender pay differences between Scotland and the rest of the UK",
    subtitle = "2017 - 2023\n",
    x = "\nFinancial year starting",
    y = "Overall mean % difference in hourly pay\n(M - F)",
    colour = ""
  ) +
  theme_minimal() +
   theme(panel.grid = element_blank(),
         axis.line = element_line(colour = "black")) +
  ylim(c(0, 15))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

How does Edinburgh compare to London when it comes to the gender pay gap?

First divide region variable appropriately:

# https://postcodefinder.net/england/london
london_postcodes <- c("BR", "CR", "DA", "E", "EC", "EN", "HA", "IG", "KT", "N", "NW", "RM", "SE", "SM", "SW", "TN", "TW", "UB", "W", "WC")

pay_lon_ed <- pay_trimmed %>% 
  mutate(post_code = str_extract(post_code, "[A-Z0-9]+")) %>% 
  mutate(post_code = str_remove_all(post_code, "[0-9]+")) %>% 
  mutate(region = case_when(
    post_code %in% london_postcodes ~ "London",
    post_code == "EH" ~ "Edinburgh",
    .default = "Rest of UK"), .after = post_code) %>% 
  mutate(region = as.factor(region)) %>% 
  select(-post_code)

pay_lon_ed
pay_lon_ed %>% 
  filter(region != "Rest of UK") %>% # filter out irrelevant companies
  ggplot() +
  geom_boxplot(aes(x = year_starting, y = diff_mean_hourly_percent, group = year_starting), alpha = 0.5) +
    facet_wrap(~ region) +
  labs(
    title = "Gender pay differences across Edinburgh and London companies",
    subtitle = "2017 - 2023\n",
    x = "\nFinancial year starting",
    y = "Mean % difference in hourly pay\n(M - F)"
  ) +
  scale_x_continuous(breaks = c(2017, 2018, 2019, 2020, 2021, 2022, 2023)) +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.line = element_line(colour = "black")) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip()

We can see here that, across the years 2017 to 2023, the disparity in hourly pay between men and women is very similar between Edinburgh- and London-based companies. Specifically, the values of mean percentage difference in pay are mostly positive which indicates men are paid more than women.

pay_lon_ed %>% 
  filter(region != "Rest of UK") %>% 
  group_by(region) %>% 
  summarise(mean_diff_hourly_across_companies = mean(diff_mean_hourly_percent),
            sd_diff_hourly_across_companies = sd(diff_mean_hourly_percent))

The total mean difference for Edinburgh companies across all years is 12.2% (SD: 13.3%) and for London companies is 13.6% (SD: 14.7%). These positive values indicate men being paid more hourly on average.

Hypothesis test comparing means between Edinburgh and London

hyp_test_data <- pay_lon_ed %>% 
  filter(region != "Rest of UK") %>% 
  select(region, diff_mean_hourly_percent)

# visualise data
hyp_test_data %>% 
  ggplot(aes(x = region, y = diff_mean_hourly_percent)) +
  geom_boxplot()

We can see the interquartile ranges of the distributions overlap substantially, and the London values are much wider spread.

Will now create the null distribution:

null_distribution <- hyp_test_data %>% 
  specify(diff_mean_hourly_percent ~ region) %>% 
  hypothesize(null = "independence") %>% #the null hypothesis is there is no relationship i.e. they are independent
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("London", "Edinburgh")) 
## Dropping unused factor levels Rest of UK from the supplied explanatory variable 'region'.
head(null_distribution)

We can see here that we have 1000 (only 6 shown) replicated samples, and each has a difference in means shown (stat).

Now we have our null distribution, let’s calculate our observed statistic and then visualise the null distribution and where the observed statistic lies on the distribution.

observed_stat <- hyp_test_data %>% 
  specify(diff_mean_hourly_percent ~ region) %>%
  calculate(stat = "diff in means", order = c("London", "Edinburgh"))
## Dropping unused factor levels Rest of UK from the supplied explanatory variable 'region'.
observed_stat

So the observed statistic is that the total mean difference in percent in London is 1.4% higher than in Edinburgh. Will now test to see if this is significant at the 0.05 level.

null_distribution %>%
  visualise() +
  shade_p_value(obs_stat = observed_stat, direction = "right")

So we see from the visualisation that the observed statistic is towards the edge of the null distribution. So there would be a small probability of getting a more extreme value than the observed value under the null hypothesis (that the difference between cities = 0)

Let’s calculate the p-value to be sure.

p_value <- null_distribution %>%
  get_p_value(obs_stat = observed_stat, direction = "right")

p_value

P is less than 0.05, meaning this difference is statistically significant. I can therefore reject the null hypothesis and conclude that I have found enough evidence in the data to to suggest that the total mean percentage difference in hourly pay (Male - Female) across London companies is statistically significantly greater than across Edinburgh companies.